library(lubridate)
library(Matrix)
library(tidyverse)
library(quanteda)
library(xgboost)
library(stringr)
require(PRROC)
require(caret)
library(text2vec)
library(readr)
# Locate the per-newspaper directories (immediate children only, returned
# with the parent path prepended).
paper_dirs <- list.dirs(
  path = "path_to_newspaper_directories",
  full.names = TRUE,
  recursive = FALSE
)
if (length(paper_dirs) == 0) {
  stop(
    "No newspaper directories found in 'path_to_newspaper_directories'.",
    call. = FALSE
  )
}

# Intended feature set (original author's note): only these features should be
# taken from the NEWSPAPERNAME_data.csv files and combined:
#   nword, nsentence, self_reference, accountability_word_count,
#   investigation_related, documents_reveal, investigation_duration,
#   series_indicated, inf1-8, winner, sum_inf, max_inf, section_score,
#   alt_section_score, and the embedding features named X2-X301.
# NOTE(review): the code below reads
# "paper_metadata_topics_sections_fasttext.csv" and keeps every column; no
# column selection is applied here -- confirm whether that is intended.

# Read each newspaper's CSV into a preallocated list, then combine once.
# The file is addressed with file.path() instead of setwd() so the working
# directory is not changed while reading.
paper_tables <- vector("list", length(paper_dirs))
for (i in seq_along(paper_dirs)) {
  paperfolder <- paper_dirs[[i]]
  paper_data <- read_csv(
    file.path(paperfolder, "paper_metadata_topics_sections_fasttext.csv")
  )
  paper_tables[[i]] <- paper_data
}

# Row-bind all newspapers into the joint dataset. A single rbind() call avoids
# re-copying the accumulated table on every iteration.
classify_data <- do.call(rbind, paper_tables)
# Switch to the output directory; every file below is written relative to it.
setwd("path_to_save_joint_dataset")

# Persist the full joint dataset before splitting.
write.csv(classify_data, "classify_data_fasttext_embedding.csv")

# Chronological split on the `date` column:
#   train: date <  2018-01-01
#   val:   2018-01-01 <= date < 2019-01-01
#   test:  date >= 2019-01-01
# Rows for which a comparison yields NA are dropped by filter() and therefore
# end up in none of the three sets.
train_set <- filter(classify_data, date < "2018-01-01")
test_set <- filter(classify_data, date >= "2018-01-01", date >= "2019-01-01")
val_set <- filter(classify_data, date >= "2018-01-01", date < "2019-01-01")

# Write the three splits.
write.csv(train_set, "nnet_train_df.csv")
write.csv(val_set, "nnet_val_df.csv")
write.csv(test_set, "nnet_test_df.csv")
